Imports

In [31]:
# --- Imports & configuration -------------------------------------------------
import requests
import scipy.stats
import pandas as pd
import numpy as np
import json
from pandas import read_csv
from statsmodels.graphics.tsaplots import plot_acf
# NOTE(review): `from pandas import datetime` is deprecated (removed in pandas>=2.0)
# and is immediately shadowed by `from datetime import datetime` below — safe to delete.
from pandas import datetime
from pandas import DataFrame
# NOTE(review): statsmodels.tsa.arima_model.ARIMA is deprecated in favour of
# statsmodels.tsa.arima.model.ARIMA; it also appears unused in this notebook.
from statsmodels.tsa.arima_model import ARIMA
from matplotlib import pyplot 
from datetime import datetime
from pmdarima.arima import auto_arima
import seaborn as sns
import warnings
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
warnings.filterwarnings("ignore")  # silences noisy library warnings (also hides real ones)
# Challenge API endpoint used by get_token / get_data below.
URL = 'http://167.172.183.67'
# SECURITY(review): credentials are hard-coded in the notebook; prefer
# os.environ / getpass so they are not committed or leaked via shared outputs.
USERNAME = "Group10"
PASSWORD = "q7NQ4hVoe7EUfxHV"
submit_now = False 

def get_token(username, password):
    """Authenticate against the challenge API and return an access token.

    Parameters
    ----------
    username, password : str
        API credentials.

    Returns
    -------
    str
        Token to pass as ``Authorization: Token <token>`` in later requests.

    Raises
    ------
    requests.HTTPError
        If the authentication request fails (non-2xx response).
    """
    body = {
        "username": username,
        "password": password,
    }
    r = requests.post(f'{URL}/token/', data=body)
    # Fail loudly on HTTP errors instead of an opaque KeyError on "key" below.
    r.raise_for_status()
    return r.json()["key"]
def get_data(token, start_date='2020-03-20'):
    """Fetch the challenge dataset from the API as a date-sorted DataFrame.

    Parameters
    ----------
    token : str
        Access token from :func:`get_token`.
    start_date : str, optional
        Earliest date to fetch, formatted ``YYYY-MM-DD``.

    Returns
    -------
    pandas.DataFrame
        API records sorted ascending by ``event_date`` (parsed to datetime).

    Raises
    ------
    requests.HTTPError
        If the request fails (non-2xx response).
    """
    header = {'Authorization': f'Token {token}'}
    r = requests.get(f'{URL}/dataset/', params={'start_date': start_date}, headers=header)
    r.raise_for_status()  # surface HTTP errors early with a clear message
    data = pd.DataFrame.from_dict(r.json())
    data["event_date"] = pd.to_datetime(data["event_date"])
    return data.sort_values(by=["event_date"])

# Authenticate once; `token` is reused by every data-fetching cell below.
token = get_token(USERNAME,PASSWORD)
def get_new_data(token):
    """Fetch fresh API data and merge it with the cached CSV snapshot.

    NOTE(review): the merged frame (`current_data`) is built but discarded —
    the function returns `new_data`, so the concat loop below is dead code.
    It was presumably meant to `return current_data`; confirm against the
    schema of challenge_unchanged.csv and downstream cells before changing.
    """
    new_data = get_data(token)
    new_unique_dates = new_data["event_date"].unique()
    # Cast product ids to plain ints (presumably delivered as strings by the API).
    new_data["product_content_id"] = new_data["product_content_id"].apply(lambda x: int(x))
    # Cached historical snapshot kept on disk alongside the notebook.
    current_data = pd.read_csv("challenge_unchanged.csv")
    current_data["event_date"] = current_data["event_date"].apply(lambda x: datetime.strptime(x,"%Y-%m-%d"))
    current_unique_dates = current_data["event_date"].unique()
    # Prepend any dates missing from the cached snapshot.
    for date in new_unique_dates:
        if(date not in current_unique_dates):
            temp = new_data[new_data["event_date"]==date]
            current_data =pd.concat([temp,current_data],ignore_index=True)
    return new_data
In [7]:
# Pull the latest data and build the groupings used throughout the notebook.
data = get_new_data(token)
product_grouped = data.groupby("product_content_id")
date_grouped = data.groupby("event_date")
unique_products = data["product_content_id"].unique()
unique_dates = data["event_date"].unique()
# product_content_id of each of the 8 tracked products, keyed by a short
# Turkish nickname (e.g. kulaklik = earphones, supurge = vacuum — TODO confirm).
dis = 32939029
yuz = 85004
islak = 4066298
kulaklik = 6676673
supurge = 7061886
tayt = 31515569
bikini = 5926527
mont = 3904356
# Parallel lists: prods[i] is the id whose label is prod_names[i].
prods = [kulaklik,yuz,islak, supurge, mont, bikini, tayt, dis]
prod_names = ['kulaklik','yuz','islak', 'supurge', 'mont', 'bikini', 'tayt', 'dis']
def func(row):
    """Return the short name for a product_content_id, or None if unknown."""
    for pid, label in zip(prods, prod_names):
        if row == pid:
            return label
    return None

# Attach short product names, snapshot the raw data to disk, and derive
# per-row conversion-rate and lagged-engagement features.
data["names"] = data["product_content_id"].apply(func)
day = datetime.today()
# Use the actual month in the filename instead of hard-coding "may"
# (produces the same "5_may_data.csv" style names during May).
data.to_csv(f"daily_data/{day.day}_{day.strftime('%b').lower()}_data.csv")
data["visit_conversion_rate"] = data["sold_count"] / data["visit_count"]
data["favored_conversion_rate"] = data["sold_count"] / data["favored_count"]
data["basket_conversion_rate"] = data["sold_count"] / data["basket_count"]

# Sort newest-first with a fixed product order inside each date. With 8
# products per date, shifting by -16 rows pulls the same product's value from
# two days earlier, and -24 from three days earlier (8 products * n days).
data = data.sort_values(["event_date", "names"], ascending=False).reset_index().drop("index", axis=1)
data["twodays_lagged_favored"] = data["favored_count"].shift(-16)
data["twodays_lagged_basket"] = data["basket_count"].shift(-16)
data["twodays_lagged_visit"] = data["visit_count"].shift(-16)
data["twodays_lagged_category_visits"] = data["category_visits"].shift(-16)

data["threedays_lagged_favored"] = data["favored_count"].shift(-24)
data["threedays_lagged_basket"] = data["basket_count"].shift(-24)

names_grouped = data.groupby("names")
unique_names = data["names"].unique()

Exploratory

In [3]:
# Interactive line chart: daily sales per product.
start = datetime(2020, 3, 1)
end = datetime(2020, 5, datetime.today().day)
import plotly.graph_objs as go

# One trace per product, labelled with its short name.
traces = [
    go.Scatter(x=grp.event_date, y=grp.sold_count, name=label, mode='lines')
    for label, grp in data.groupby("names")
]
fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[start, end],
                  yaxis_range=[0, 2000], title="Sales vs Time")
fig.show()
plot(fig, filename="plots/sales_vs_time.html")
Out[3]:
'plots/sales_vs_time.html'
In [6]:
# Interactive line chart: visit-to-sale conversion rate per product.
start = datetime(2020, 3, 1)
end = datetime(2020, 5, datetime.today().day)
import plotly.graph_objs as go

# One trace per product, labelled with its short name.
traces = [
    go.Scatter(x=grp.event_date, y=grp.visit_conversion_rate, name=label, mode='lines')
    for label, grp in data.groupby("names")
]
fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[start, end],
                  yaxis_range=[0, 0.2], title="Visit_Conversion_Rate")
fig.show()
plot(fig, filename="plots/Visit_Conversion_Rate.html")
Out[6]:
'plots/Visit_Conversion_Rate.html'
In [10]:
# Interactive line chart: daily visit counts per product.
start = datetime(2020, 4, 1)
end = datetime(2020, 5, datetime.today().day)
import plotly.graph_objs as go

# One trace per product, labelled with its short name.
traces = [
    go.Scatter(x=grp.event_date, y=grp.visit_count, name=label, mode='lines')
    for label, grp in data.groupby("names")
]
fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[start, end],
                  yaxis_range=[0, 50000],
                  title="Visit vs Time")
fig.show()
plot(fig, filename="plots/Visit_vs_Time.html")
Out[10]:
'plots/Visit_vs_Time.html'
In [11]:
# Interactive line chart: category-level visit counts per product.
init_date = [2020, 3, 1]
last_date = [2020, 6, 3]
import plotly.graph_objs as go
fig = go.Figure()
traces = []
for products, k in data.groupby("names"):
    traces.append(go.Scatter(x=k.event_date, y=k.category_visits, name=products, mode='lines'))

fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[datetime(init_date[0], init_date[1], init_date[2]),
                               datetime(last_date[0], last_date[1], last_date[2])],
                  # Fixed typo in the chart title ("Categorty" -> "Category").
                  title="Category_Visits")
fig.show()
In [41]:
# Autocorrelation of bikini daily sales up to lag 7 (checks for weekly seasonality).
plot_acf(data.groupby("names").get_group('bikini')['sold_count'],lags = 7)
Out[41]:

Rolling Function

In [12]:
def rolling_analys(name, data=data, size=30, window=7):
    """Plot rolling-statistics stationarity diagnostics for daily sales.

    Draws three stacked panels:
      1. raw ``sold_count`` with its rolling mean/std,
      2. the de-trended series z = (x - rolling_mean) / rolling_std,
      3. the de-trended series differenced at lag ``window``.

    Parameters
    ----------
    name : str
        Product label used in the panel titles.
    data : pandas.DataFrame, optional
        Frame with a ``sold_count`` column; defaults to the notebook-global
        ``data`` captured at definition time.
    size : int, optional
        Unused; kept so existing call sites remain valid.
    window : int, optional
        Rolling-window length in rows (days).
    """
    # Work on a copy so the caller's frame (often a slice) is not mutated
    # with the helper columns — the original silently wrote through a slice.
    frame = data.copy()
    rolling = frame.sold_count.rolling(window=window)
    frame['z_data'] = (frame['sold_count'] - rolling.mean()) / rolling.std()
    frame['zp_data'] = frame['z_data'] - frame['z_data'].shift(window)

    fig, ax = pyplot.subplots(3, figsize=(12, 9))

    ax[0].set_title(name + ' window: ' + str(window) + " raw rolling")
    ax[0].plot(frame.index, frame.sold_count, label='raw data')
    ax[0].plot(frame.sold_count.rolling(window=window).mean(), label="rolling mean")
    ax[0].plot(frame.sold_count.rolling(window=window).std(), label="rolling std ")
    ax[0].legend()

    ax[1].set_title(name + ' window: ' + str(window) + " de-trended rolling")
    ax[1].plot(frame.index, frame.z_data, label="de-trended data")
    ax[1].plot(frame.z_data.rolling(window=window).mean(), label="rolling mean")
    ax[1].plot(frame.z_data.rolling(window=window).std(), label="rolling std ")
    ax[1].legend()

    ax[2].set_title(name + ' window: ' + str(window) + " lag differenced de-trended")
    ax[2].plot(frame.index, frame.zp_data, label=str(window) + " lag differenced de-trended data")
    ax[2].plot(frame.zp_data.rolling(window=window).mean(), label="rolling mean")
    ax[2].plot(frame.zp_data.rolling(window=window).std(), label="rolling std ")
    ax[2].legend()
In [16]:
# Rolling stationarity diagnostics on the 60 most recent days of earphone sales.
to_roll = names_grouped.get_group("kulaklik").sort_values("event_date",ascending = False).reset_index(drop = True)
rolling_analys("kulaklik",to_roll.iloc[0:60],window = 7)

Statistical Analysis

In [143]:
# Regression pairplots: sold_count vs price and lagged engagement metrics,
# using the 30 most recent rows of each product.
to_plot = []
for name in unique_names:
    to_plot.append(names_grouped.get_group(name).iloc[0:30][[
        "price", "sold_count", "twodays_lagged_favored", "visit_count",
        "twodays_lagged_basket", "twodays_lagged_visit"]])
# Iterate over however many products exist instead of a hard-coded 8.
for i, frame in enumerate(to_plot):
    g = sns.pairplot(data=frame,
                     kind="reg", y_vars="sold_count",
                     x_vars=["price", "twodays_lagged_favored", "twodays_lagged_basket",
                             "visit_count", "twodays_lagged_visit"])

    g.fig.suptitle(unique_names[i], y=1.08)
In [4]:
# Regression pairplots: sold_count vs category-level metrics, using the
# 15 most recent rows of each product.
to_plot = []
for name in unique_names:
    to_plot.append(
        names_grouped.get_group(name).iloc[0:15][[
            "category_sold", "sold_count", "category_brand_sold",
            "category_visits", "ty_visits", "twodays_lagged_category_visits"
        ]])
# Iterate over however many products exist instead of a hard-coded 8.
for i, frame in enumerate(to_plot):
    g = sns.pairplot(data=frame,
                     kind="reg",
                     y_vars="sold_count",
                     x_vars=[
                         "category_sold", "category_brand_sold",
                         "category_visits", "ty_visits", "twodays_lagged_category_visits"
                     ])

    g.fig.suptitle(unique_names[i], y=1.08)
In [12]:
# Pearson correlation of every numeric column with sold_count, per product,
# over the 60 most recent rows; then collect candidate predictor columns.
corr_coefs = pd.DataFrame(index=unique_names, columns=data.columns)
p_values = pd.DataFrame(index=unique_names, columns=data.columns)
for name in unique_names:
    target = names_grouped.get_group(name).iloc[0:60]["sold_count"]
    for column in data.columns:
        try:
            feature = names_grouped.get_group(name).iloc[0:60][column]
            corr_coefs.at[name, column] = np.corrcoef(target, feature)[0, 1]
            p_values.at[name, column] = scipy.stats.pearsonr(target, feature)[1]
        except Exception:
            # Non-numeric columns (dates, names) cannot be correlated; skip them.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            continue
corr_coefs.drop(["event_date", "product_content_id", "sold_count", "names"],
                inplace=True, axis=1)
# Keep columns with p < 0.1 and |r| > 0.2 as candidate predictors per product.
# NaN entries fail both comparisons, so they are excluded as before.
significants = {}
for name in unique_names:
    significant = []
    for column in corr_coefs.columns:
        if p_values.at[name, column] < 0.1:
            if abs(corr_coefs.at[name, column]) > 0.2:
                significant.append(column)
    significants[name] = significant
In [13]:
corr_coefs
Out[13]:
price visit_count favored_count basket_count category_sold category_brand_sold category_visits ty_visits visit_conversion_rate favored_conversion_rate basket_conversion_rate twodays_lagged_favored twodays_lagged_basket twodays_lagged_visit twodays_lagged_category_visits threedays_lagged_favored threedays_lagged_basket
yuz -0.488798 0.708768 0.689005 0.849631 0.805753 0.691646 0.701791 0.667513 0.599543 0.341016 0.298756 0.429136 0.5441 0.508334 0.386057 0.496752 0.460169
tayt 0.232782 0.717491 0.825197 0.94934 0.629061 0.864069 0.524785 0.278864 0.67611 NaN 0.033901 0.182443 0.207259 0.201577 0.105172 0.0648735 0.04893
supurge -0.433985 0.787849 0.717931 0.871812 0.385718 0.512941 0.369172 0.419524 0.368825 0.159445 0.25063 0.537189 0.524301 0.442969 0.117297 0.223603 0.19964
mont NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
kulaklik -0.584743 0.689061 0.72179 0.934583 0.453498 0.977164 0.593568 0.542499 0.827452 0.826483 0.654697 0.368256 0.424489 0.326109 0.278565 0.150512 0.229724
islak -0.557468 0.855011 0.882281 0.969365 0.768332 0.896898 0.839319 0.514824 0.375044 0.205853 -0.113828 0.18939 0.180034 0.143533 0.0782971 0.0370457 0.0150578
dis 0.191628 0.947635 0.937609 0.981434 0.789958 0.801831 0.831278 0.492018 0.688163 0.586579 NaN 0.791976 0.825454 0.777356 0.6646 0.688908 0.757395
bikini 0.757221 0.457341 0.564261 0.625256 0.552995 0.452419 0.485486 -0.00753934 0.691306 0.424623 NaN 0.366983 0.260918 0.314876 0.486227 0.177601 0.190553